MegaMUGA Annotations

# Reading in "control genotype" data
megamuga_controls <- data.table::fread("../data/MMControls/MegaMUGA genotypes CONTROLS.csv")
control_genotypes <- data.table::fread("../data/MMControls/control.genotypes.csv")
## Warning in data.table::fread("../data/MMControls/control.genotypes.csv"):
## Detected 364 column names but the data has 365 columns (i.e. invalid file).
## Added 1 extra default column name for the first column which is guessed to be
## row names or an index. Use setnames() afterwards if this guess is not correct,
## or fix the file write command that created the file to create a valid file.
colnames(control_genotypes)[1] <- "marker"

## Reading in marker annotations
mm_metadata <- data.table::fread("../data/MMControls/mm_uwisc_v2.csv")

## Calculating allele frequencies for each marker
control_allele_freqs <- control_genotypes %>%
  tidyr::pivot_longer(-marker, names_to = "sample", values_to = "genotype") %>%
  dplyr::group_by(marker, genotype) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::mutate(freq = round(n/sum(n), 3),
                genotype = as.factor(genotype)) %>%
  dplyr::left_join(., mm_metadata)

First, we searched for probes where many mice are missing genotype calls.

## Filtering to markers with missing genotypes
no.calls <- control_allele_freqs %>%
  dplyr::filter(genotype == "N") %>%
  tidyr::pivot_wider(names_from = genotype, values_from = n) %>%
  dplyr::select(marker, chr, bp_grcm39, freq) %>%
  dplyr::mutate(chr = as.factor(chr))
cutoff <- quantile(no.calls$freq, probs = seq(0,1,0.05))[[20]]
above.cutoff <- no.calls %>%
  dplyr::filter(freq > cutoff)

Of 77808 markers, 54990 failed to genotype at least one sample, and 2750 markers failed to genotype at least 12.765% of samples.

In a similar fashion, we calculated the number of reference samples with missing genotypes. Repeated observations of samples/strains meant that genotype counts for each marker among them couldn’t be grouped and tallied, so determining no-call frequency occurred column-wise.

n.calls.strains <- apply(X = control_genotypes[,2:ncol(control_genotypes)], MARGIN = 2, function(x) table(x)[5])
n.calls.strains.df <- data.frame(n.calls.strains)
n.calls.strains.df$sample <- names(n.calls.strains)
n.calls.strains.df %<>%
  dplyr::rename(n.no.calls = n.calls.strains)
sampleQC <- ggplot(n.calls.strains.df, 
                   mapping = aes(x = reorder(sample,n.no.calls), 
                                 y = n.no.calls,
                                 text = paste("Sample:", sample))) + 
  geom_point() +
  QCtheme + 
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  labs(x = "Number of mice with missing genotypes",
       y = "Number of markers")
ggplotly(sampleQC, tooltip = c("text","y"))